Topic Modelling and Clustering an RSS feed

Modules used:

  • Scikit-Learn
  • nltk
  • feedparser

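If the packages are not already installed, something along these lines should set them up (these are the usual PyPI names); the NLTK stop-word corpus also has to be downloaded once before the imports below will run:

# one-time setup
# pip install scikit-learn nltk feedparser
import nltk
nltk.download('stopwords')   # required for nltk.corpus.stopwords
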
In [1]:
import feedparser
import re
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
import string
from collections import Counter
stop_words = set(stopwords.words('english'))
stop_words.update(string.punctuation)

In [2]:
def getwords(html):
    '''
    Strip HTML tags, tokenize, lowercase, and drop stop words and punctuation
    '''
    # Remove anything that looks like an HTML/XML tag
    txt = re.compile(r'<[^>]+>').sub('', html)
    word_list = [i.lower() for i in wordpunct_tokenize(txt) if i.lower() not in stop_words]
    return word_list

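A quick sanity check on a made-up HTML snippet (illustrative only, not part of the original run) shows what the tokenizer produces:

getwords('<p>Hello, <b>World</b> of feeds!</p>')
# ['hello', 'world', 'feeds']  -- tags, punctuation and stop words removed
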
In [3]:
def getwordcounts(url):
    '''
    Fetch the feed at `url` and return one "title + summary" string per post
    '''
    d = feedparser.parse(url)
    summary = []
    for e in d.entries:
        if 'summary' in e:
            summary.append(e.title + ' ' + e.summary)
        else:
            summary.append(e.title + ' ' + e.description)
    return summary

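Before pulling the summaries it can help to peek at what feedparser returns: `parse` gives a dictionary-like object whose `entries` list holds the individual posts (a quick look, not part of the original run):

d = feedparser.parse('https://sethuiyer.wordpress.com/feed/atom/')
print(d.feed.get('title'))   # blog title, if the feed provides one
print(len(d.entries))        # number of posts in the feed
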
In [4]:
summary = getwordcounts('https://sethuiyer.wordpress.com/feed/atom/')

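Judging by the document indices printed at the end, the feed yielded six posts at the time; a quick check:

len(summary)      # number of posts fetched
summary[0][:80]   # first 80 characters of the first post
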
In [5]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(tokenizer=getwords,   # reuse the cleaning/tokenizing function above
                             max_df=0.5,           # drop terms appearing in more than half the posts
                             min_df=0.1,           # drop terms appearing in fewer than 10% of posts
                             lowercase=True)
tfidf_model = vectorizer.fit_transform(summary)

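The resulting matrix has one row per post and one column per surviving vocabulary term; its shape and the vocabulary can be inspected directly (a quick check, using the same older get_feature_names API as the rest of this notebook):

print(tfidf_model.shape)                    # (n_posts, n_terms)
print(vectorizer.get_feature_names()[:10])  # first few vocabulary terms
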
In [6]:
import collections
km_model = KMeans(n_clusters=3)
km_model.fit(tfidf_model)

# Group document indices by their assigned cluster label
clustering = collections.defaultdict(list)
for idx, label in enumerate(km_model.labels_):
    clustering[label].append(idx)

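At this point `clustering` maps each cluster label to the indices of the posts it contains, which can be printed for a first look at the grouping:

for label, docs in sorted(clustering.items()):
    print('Cluster', label, '->', docs)
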
In [7]:
from sklearn.decomposition import NMF
nmf = NMF(n_components=3, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf_model)
feature_names = vectorizer.get_feature_names()

def print_top_words(model, feature_names, n_top_words=2):
    '''
    Return the n_top_words highest-weighted terms for each NMF component
    '''
    topic_list = []
    for topic_idx, topic in enumerate(model.components_):
        topic_list.append(" ".join([feature_names[i]
                          for i in topic.argsort()[:-n_top_words - 1:-1]]))
    return topic_list

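Two words per topic keeps the labels short; passing a larger n_top_words gives a fuller picture of each component, e.g.:

print_top_words(nmf, feature_names, n_top_words=5)   # five terms per topic instead of two
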
In [8]:
topic_list = print_top_words(nmf, feature_names, 2)

In [9]:
# Note: NMF topic i and KMeans cluster i are matched by index here,
# which is a simplification -- the two models are fitted independently.
for i in range(3):
    print("Topic Name: ", topic_list[i])
    print("Documents in the cluster: ", clustering[i])
    print('----------------')


Topic Name:  data started
Documents in the cluster:  [1, 2]
----------------
Topic Name:  extension limitless
Documents in the cluster:  [3, 4]
----------------
Topic Name:  classification vector
Documents in the cluster:  [0, 5]
----------------